library(tidyverse)
library(AmesHousing)
library(recipes)
library(caret)
library(rpart)
library(rpart.plot)
library(ranger)
library(xgboost)
library(AUC)

Data prep

data("credit_data")

set.seed(42)
credit_data <- credit_data %>%
  mutate(
    base = if_else(runif(nrow(credit_data)) < 0.7, "treino", "teste")
  )

receita <- recipe(Status ~ ., data = credit_data %>% filter(base == "treino") %>% select(-base)) %>%
  step_meanimpute(all_numeric(), -all_outcomes()) %>%
  step_modeimpute(all_nominal(), -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_corr(all_predictors()) %>%
  step_nzv(all_predictors())

Decision tree

getModelInfo("rpart", FALSE)$rpart
$label
[1] "CART"

$library
[1] "rpart"

$type
[1] "Regression"     "Classification"

$parameters

$grid
function(x, y, len = NULL, search = "grid"){
                    dat <- if(is.data.frame(x)) x else as.data.frame(x)
                    dat$.outcome <- y
                    initialFit <- rpart::rpart(.outcome ~ .,
                                               data = dat,
                                               control = rpart::rpart.control(cp = 0))$cptable
                    initialFit <- initialFit[order(-initialFit[,"CP"]), , drop = FALSE]
                    if(search == "grid") {
                      if(nrow(initialFit) < len) {
                        tuneSeq <- data.frame(cp = seq(min(initialFit[, "CP"]),
                                                       max(initialFit[, "CP"]),
                                                       length = len))
                      } else tuneSeq <-  data.frame(cp = initialFit[1:len,"CP"])
                      colnames(tuneSeq) <- "cp"
                    } else {
                      tuneSeq <- data.frame(cp = unique(sample(initialFit[, "CP"], size = len, replace = TRUE)))
                    }

                    tuneSeq
                  }

$loop
function(grid) {
                    grid <- grid[order(grid$cp, decreasing = FALSE),, drop = FALSE]
                    loop <- grid[1,,drop = FALSE]
                    submodels <- list(grid[-1,,drop = FALSE])
                    list(loop = loop, submodels = submodels)
                  }

$fit
function(x, y, wts, param, lev, last, classProbs, ...) {
                    cpValue <- if(!last) param$cp else 0
                    theDots <- list(...)
                    if(any(names(theDots) == "control"))
                    {
                      theDots$control$cp <- cpValue
                      theDots$control$xval <- 0
                      ctl <- theDots$control
                      theDots$control <- NULL
                    } else ctl <- rpart::rpart.control(cp = cpValue, xval = 0)

                    ## check to see if weights were passed in (and availible)
                    if(!is.null(wts)) theDots$weights <- wts

                    modelArgs <- c(list(formula = as.formula(".outcome ~ ."),
                                        data = if(is.data.frame(x)) x else as.data.frame(x),
                                        control = ctl),
                                   theDots)
                    modelArgs$data$.outcome <- y

                    out <- do.call(rpart::rpart, modelArgs)

                    if(last) out <- rpart::prune.rpart(out, cp = param$cp)
                    out
                  }

$predict
function(modelFit, newdata, submodels = NULL) {
                    if(!is.data.frame(newdata)) newdata <- as.data.frame(newdata)

                    pType <- if(modelFit$problemType == "Classification") "class" else "vector"
                    out  <- predict(modelFit, newdata, type=pType)

                    if(!is.null(submodels))
                    {
                      tmp <- vector(mode = "list", length = nrow(submodels) + 1)
                      tmp[[1]] <- out
                      for(j in seq(along = submodels$cp))
                      {
                        prunedFit <- rpart::prune.rpart(modelFit, cp = submodels$cp[j])
                        tmp[[j+1]]  <- predict(prunedFit, newdata, type=pType)
                      }
                      out <- tmp
                    }
                    out
                  }

$prob
function(modelFit, newdata, submodels = NULL) {
                    if(!is.data.frame(newdata)) newdata <- as.data.frame(newdata)
                    out <- predict(modelFit, newdata, type = "prob")

                    if(!is.null(submodels))
                    {
                      tmp <- vector(mode = "list", length = nrow(submodels) + 1)
                      tmp[[1]] <- out
                      for(j in seq(along = submodels$cp))
                      {
                        prunedFit <- rpart::prune.rpart(modelFit, cp = submodels$cp[j])
                        tmpProb <- predict(prunedFit, newdata, type = "prob")
                        tmp[[j+1]] <- as.data.frame(tmpProb[, modelFit$obsLevels, drop = FALSE])
                      }
                      out <- tmp
                    }
                    out
                  }

$predictors
function(x, surrogate = TRUE, ...)  {
                    out <- as.character(x$frame$var)
                    out <- out[!(out %in% c("<leaf>"))]
                    if(surrogate)
                    {
                      splits <- x$splits
                      splits <- splits[splits[,"adj"] > 0,]
                      out <- c(out, rownames(splits))
                    }
                    unique(out)
                  }

$varImp
function(object, surrogates = FALSE, competes = TRUE, ...) {
                    if(nrow(object$splits)>0) {
                      tmp <- rownames(object$splits)
                      rownames(object$splits) <- 1:nrow(object$splits)
                      splits <- data.frame(object$splits)
                      splits$var <- tmp
                      splits$type <- ""

                      frame <- as.data.frame(object$frame)
                      index <- 0
                      for(i in 1:nrow(frame)) {
                        if(frame$var[i] != "<leaf>") {
                          index <- index + 1
                          splits$type[index] <- "primary"
                          if(frame$ncompete[i] > 0) {
                            for(j in 1:frame$ncompete[i]) {
                              index <- index + 1
                              splits$type[index] <- "competing"
                            }
                          }
                          if(frame$nsurrogate[i] > 0) {
                            for(j in 1:frame$nsurrogate[i]) {
                              index <- index + 1
                              splits$type[index] <- "surrogate"
                            }
                          }
                        }
                      }
                      splits$var <- factor(as.character(splits$var))
                      if(!surrogates) splits <- subset(splits, type != "surrogate")
                      if(!competes) splits <- subset(splits, type != "competing")
                      out <- aggregate(splits$improve,
                                       list(Variable = splits$var),
                                       sum,
                                       na.rm = TRUE)
                    } else {
              out <- data.frame(x = numeric(), Vaiable = character())
            }
                    allVars <- colnames(attributes(object$terms)$factors)
                    if(!all(allVars %in% out$Variable)) {
                      missingVars <- allVars[!(allVars %in% out$Variable)]
                      zeros <- data.frame(x = rep(0, length(missingVars)),
                                          Variable = missingVars)
                      out <- rbind(out, zeros)
                    }
                    out2 <- data.frame(Overall = out$x)
                    rownames(out2) <- out$Variable
                    out2
                  }

$levels
function(x) x$obsLevels

$trim
function(x) {
                    x$call <- list(na.action = (x$call)$na.action)
                    x$x <- NULL
                    x$y <- NULL
                    x$where <- NULL
                    x
                  }

$tags
[1] "Tree-Based Model"              "Implicit Feature Selection"   
[3] "Handle Missing Predictor Data" "Accepts Case Weights"         

$sort
function(x) x[order(x[,1], decreasing = TRUE),]
train_control_rpart <- trainControl(
  method = "cv", 
  number = 5, 
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  verboseIter = 1 
)

# HINT: run
# info <- getModelInfo("rpart", FALSE)$rpart
# info$parameters

grid_rpart <- data.frame(
  cp = seq(-0.001, 0.01, by= 0.0001)
)

modelo_rpart <- train(
  receita, 
  credit_data %>% filter(base == "treino") %>% select(-base), 
  method = "rpart", 
  metric = "ROC",
  trControl = train_control_rpart,
  tuneGrid = grid_rpart
)
Preparing recipe
+ Fold1: cp=-0.001 
- Fold1: cp=-0.001 
+ Fold2: cp=-0.001 
- Fold2: cp=-0.001 
+ Fold3: cp=-0.001 

Result

modelo_rpart
CART 

3070 samples
  13 predictor
   2 classes: 'bad', 'good' 

Recipe steps: meanimpute, modeimpute, dummy, corr, nzv 
Resampling: Cross-Validated (5 fold) 
Summary of sample sizes: 2456, 2455, 2457, 2457, 2455 
Resampling results across tuning parameters:

  cp       ROC        Sens       Spec     
  -0.0010  0.7385880  0.4976746  0.8292404
  -0.0009  0.7385880  0.4976746  0.8292404
  -0.0008  0.7385880  0.4976746  0.8292404
  -0.0007  0.7385880  0.4976746  0.8292404
  -0.0006  0.7385880  0.4976746  0.8292404
  -0.0005  0.7385880  0.4976746  0.8292404
  -0.0004  0.7385880  0.4976746  0.8292404
  -0.0003  0.7385880  0.4976746  0.8292404
  -0.0002  0.7385880  0.4976746  0.8292404
  -0.0001  0.7385880  0.4976746  0.8292404
   0.0000  0.7530080  0.4838416  0.8419563
   0.0001  0.7530080  0.4838416  0.8419563
   0.0002  0.7530080  0.4838416  0.8419563
   0.0003  0.7508616  0.4815428  0.8428633
   0.0004  0.7497139  0.4792373  0.8460431
   0.0005  0.7459665  0.4838350  0.8460431
   0.0006  0.7468441  0.4815228  0.8474067
   0.0007  0.7468441  0.4815228  0.8474067
   0.0008  0.7527045  0.4734768  0.8619470
   0.0009  0.7527045  0.4734768  0.8619470
   0.0010  0.7512653  0.4734768  0.8637652
   0.0011  0.7519638  0.4757757  0.8646722
   0.0012  0.7519638  0.4757757  0.8646722
   0.0013  0.7519638  0.4757757  0.8646722
   0.0014  0.7519638  0.4757757  0.8646722
   0.0015  0.7546083  0.4758355  0.8728458
   0.0016  0.7546083  0.4758355  0.8728458
   0.0017  0.7546083  0.4758355  0.8728458
   0.0018  0.7546083  0.4758355  0.8728458
   0.0019  0.7546083  0.4758355  0.8728458
   0.0020  0.7513883  0.4723939  0.8728458
   0.0021  0.7513883  0.4723939  0.8728458
   0.0022  0.7520362  0.4781676  0.8737580
   0.0023  0.7520362  0.4781676  0.8737580
   0.0024  0.7515843  0.4793236  0.8742125
   0.0025  0.7502133  0.4804731  0.8769336
   0.0026  0.7503254  0.4804731  0.8778407
   0.0027  0.7477346  0.4701282  0.8792012
   0.0028  0.7477346  0.4701282  0.8792012
   0.0029  0.7503338  0.4563218  0.8837374
   0.0030  0.7503338  0.4563218  0.8837374
   0.0031  0.7503338  0.4563218  0.8837374
   0.0032  0.7503338  0.4563218  0.8837374
   0.0033  0.7503338  0.4563218  0.8837374
   0.0034  0.7507174  0.4563218  0.8837374
   0.0035  0.7503456  0.4540097  0.8855556
   0.0036  0.7511207  0.4540097  0.8846465
   0.0037  0.7465225  0.4413660  0.8923593
   0.0038  0.7465225  0.4413660  0.8923593
   0.0039  0.7408048  0.4367550  0.8932684
   0.0040  0.7335255  0.4309747  0.8964502
   0.0041  0.7335255  0.4309747  0.8964502
   0.0042  0.7335255  0.4309747  0.8964502
   0.0043  0.7335255  0.4309747  0.8964502
   0.0044  0.7336564  0.4240516  0.8946403
   0.0045  0.7336564  0.4240516  0.8946403
   0.0046  0.7320066  0.4206033  0.8973614
   0.0047  0.7320066  0.4206033  0.8973614
   0.0048  0.7313576  0.4206033  0.8982705
   0.0049  0.7290796  0.4160056  0.8996341
   0.0050  0.7290796  0.4160056  0.8996341
   0.0051  0.7286288  0.4343964  0.8896547
   0.0052  0.7286288  0.4343964  0.8896547
   0.0053  0.7297652  0.4286493  0.8928293
   0.0054  0.7297652  0.4286493  0.8928293
   0.0055  0.7297652  0.4286493  0.8928293
   0.0056  0.7297652  0.4286493  0.8928293
   0.0057  0.7297652  0.4286493  0.8928293
   0.0058  0.7298794  0.4147831  0.8964616
   0.0059  0.7298794  0.4147831  0.8964616
   0.0060  0.7298794  0.4147831  0.8964616
   0.0061  0.7298794  0.4147831  0.8964616
   0.0062  0.7298794  0.4147831  0.8964616
   0.0063  0.7298794  0.4147831  0.8964616
   0.0064  0.7298794  0.4147831  0.8964616
   0.0065  0.7302964  0.4216796  0.8928334
   0.0066  0.7302964  0.4216796  0.8928334
   0.0067  0.7302964  0.4216796  0.8928334
   0.0068  0.7302964  0.4216796  0.8928334
   0.0069  0.7172084  0.4090359  0.9000897
   0.0070  0.7172084  0.4090359  0.9000897
   0.0071  0.7172084  0.4090359  0.9000897
   0.0072  0.7164385  0.4067238  0.8982715
   0.0073  0.7177774  0.4228158  0.8955442
   0.0074  0.7177774  0.4228158  0.8955442
   0.0075  0.7177774  0.4228158  0.8955442
   0.0076  0.7177774  0.4228158  0.8955442
   0.0077  0.7177774  0.4228158  0.8955442
   0.0078  0.7177774  0.4228158  0.8955442
   0.0079  0.7177774  0.4228158  0.8955442
   0.0080  0.7177774  0.4228158  0.8955442
   0.0081  0.7177774  0.4228158  0.8955442
   0.0082  0.7177774  0.4228158  0.8955442
   0.0083  0.7177774  0.4228158  0.8955442
   0.0084  0.7177774  0.4228158  0.8955442
   0.0085  0.7177774  0.4228158  0.8955442
   0.0086  0.7177774  0.4228158  0.8955442
   0.0087  0.6905127  0.3848914  0.9023645
   0.0088  0.6905127  0.3848914  0.9023645
   0.0089  0.6905127  0.3848914  0.9023645
   0.0090  0.6905127  0.3848914  0.9023645
   0.0091  0.6905127  0.3848914  0.9023645
   0.0092  0.6905127  0.3848914  0.9023645
   0.0093  0.6905127  0.3848914  0.9023645
   0.0094  0.6881358  0.3641153  0.9078149
   0.0095  0.6881358  0.3641153  0.9078149
   0.0096  0.6881358  0.3641153  0.9078149
   0.0097  0.6881358  0.3641153  0.9078149
   0.0098  0.6881358  0.3641153  0.9078149
   0.0099  0.6881358  0.3641153  0.9078149
   0.0100  0.6881358  0.3641153  0.9078149

ROC was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.0019.
modelo_rpart$bestTune
varImp(modelo_rpart)
rpart variable importance
plot(modelo_rpart)

# only for trees
rpart.plot(modelo_rpart$finalModel)
pdf("arvore.pdf", 20, 10)
rpart.plot(modelo_rpart$finalModel)
dev.off()
png 
  2 

# Confusion matrix
credit_data <- credit_data %>% 
  mutate(
    pred_rpart = predict(modelo_rpart, ., type = "prob")$bad
  )

credit_data_teste <- credit_data %>% filter(base %in% "teste")
caret::confusionMatrix(
  predict(modelo_rpart, credit_data_teste), 
  credit_data_teste$Status, 
  mode = "everything"
)
Confusion Matrix and Statistics

          Reference
Prediction bad good
      bad  185  119
      good 201  879
                                          
               Accuracy : 0.7688          
                 95% CI : (0.7457, 0.7908)
    No Information Rate : 0.7211          
    P-Value [Acc > NIR] : 3.213e-05       
                                          
                  Kappa : 0.3851          
                                          
 Mcnemar's Test P-Value : 5.953e-06       
                                          
            Sensitivity : 0.4793          
            Specificity : 0.8808          
         Pos Pred Value : 0.6086          
         Neg Pred Value : 0.8139          
              Precision : 0.6086          
                 Recall : 0.4793          
                     F1 : 0.5362          
             Prevalence : 0.2789          
         Detection Rate : 0.1337          
   Detection Prevalence : 0.2197          
      Balanced Accuracy : 0.6800          
                                          
       'Positive' Class : bad             
                                          

roc_rpart$roc %>% walk(plot)

# extra plot ---- careful: advanced R code!
roc_plot <- roc_rpart %>%
  select(base, roc, auc) %>%
  mutate(
    roc = map(roc, ~{
      .x %>% 
        unclass %>% 
        as.data.frame
    })
  ) %>%
  unnest(roc) %>%
  ggplot(aes(x = fpr, y = tpr, colour = base, label = cutoffs)) +
  geom_line() +
  geom_abline(colour = "grey50") +
  theme_minimal() +
  coord_fixed()

plotly::ggplotly(roc_plot)

Random Forest

infos <- getModelInfo("ranger", FALSE)$ranger
save(modelo_rf, file = "modelo_rf.RData")  # 'file' must be passed by name, or save() treats the path as an object to save

Result

varImp(modelo_rf)
ranger variable importance
# Predictions

credit_data <- credit_data %>% 
  mutate(
    pred_rf = predict(modelo_rf, ., type = "prob")$bad
  )
# Confusion matrix
credit_data_teste <- credit_data %>% filter(base %in% "teste")
caret::confusionMatrix(predict(modelo_rf, credit_data_teste), credit_data_teste$Status, mode = "everything")
Confusion Matrix and Statistics

          Reference
Prediction bad good
      bad  180   79
      good 206  919
                                          
               Accuracy : 0.7941          
                 95% CI : (0.7718, 0.8151)
    No Information Rate : 0.7211          
    P-Value [Acc > NIR] : 2.703e-10       
                                          
                  Kappa : 0.4306          
                                          
 Mcnemar's Test P-Value : 8.419e-14       
                                          
            Sensitivity : 0.4663          
            Specificity : 0.9208          
         Pos Pred Value : 0.6950          
         Neg Pred Value : 0.8169          
              Precision : 0.6950          
                 Recall : 0.4663          
                     F1 : 0.5581          
             Prevalence : 0.2789          
         Detection Rate : 0.1301          
   Detection Prevalence : 0.1871          
      Balanced Accuracy : 0.6936          
                                          
       'Positive' Class : bad             
                                          
# Model comparison
rocs %>%
  ggplot(aes(x = auc, y = modelo, colour = base)) +
  geom_point(size = 5) +
  theme_minimal(30)

# extra plot ---- careful: advanced R code!
roc_plot <- rocs %>%
  select(base, modelo, roc) %>%
  mutate(
    roc = map(roc, ~{
      .x %>% 
        unclass %>% 
        as.data.frame
    })
  ) %>%
  unnest(roc) %>%
  ggplot(aes(x = fpr, y = tpr, colour = modelo, label = cutoffs)) +
  geom_line() +
  geom_abline(colour = "grey50") +
  theme_minimal() +
  coord_fixed() +
  facet_wrap(~base)

plotly::ggplotly(roc_plot)

XGBoost

Exercise: fit an xgboost using caret and answer: which model has the highest AUC? Ctrl+C, Ctrl+V on your own!

HINT 1) swap "ranger" for "xgbTree". HINT 2) run `info <- getModelInfo("xgbTree", FALSE)$xgbTree` and then inspect `info$parameters`. HINT 3) try the `tuneLength = 20` argument instead of `tuneGrid`.

getModelInfo("xgbTree", FALSE)$xgbTree
$label
[1] "eXtreme Gradient Boosting"

$library
[1] "xgboost" "plyr"   

$type
[1] "Regression"     "Classification"

$parameters

$grid
function(x, y, len = NULL, search = "grid") {
                    if(search == "grid") {
                      out <- expand.grid(max_depth = seq(1, len),
                                         nrounds = floor((1:len) * 50),
                                         eta = c(.3, .4),
                                         gamma = 0,
                                         colsample_bytree = c(.6, .8),
                                         min_child_weight = c(1),
                                         subsample = seq(.5, 1, length = len))
                    } else {
                      out <- data.frame(nrounds = sample(1:1000, size = len, replace = TRUE),
                                        max_depth = sample(1:10, replace = TRUE, size = len),
                                        eta = runif(len, min = .001, max = .6),
                                        gamma = runif(len, min = 0, max = 10),
                                        colsample_bytree = runif(len, min = .3, max = .7),
                                        min_child_weight = sample(0:20, size = len, replace = TRUE),
                                        subsample = runif(len, min = .25, max = 1))
                      out$nrounds <- floor(out$nrounds)
                      out <- out[!duplicated(out),]
                    }
                    out
                  }

$loop
function(grid) {
                    loop <- plyr::ddply(grid, c("eta", "max_depth", "gamma",
                                          "colsample_bytree", "min_child_weight",
                                          "subsample"),
                                  function(x) c(nrounds = max(x$nrounds)))
                    submodels <- vector(mode = "list", length = nrow(loop))
                    for(i in seq(along = loop$nrounds)) {
                      index <- which(grid$max_depth == loop$max_depth[i] &
                                       grid$eta == loop$eta[i] &
                                       grid$gamma == loop$gamma[i] &
                                       grid$colsample_bytree == loop$colsample_bytree[i] &
                                       grid$min_child_weight == loop$min_child_weight[i] &
                                       grid$subsample == loop$subsample[i])
                      trees <- grid[index, "nrounds"]
                      submodels[[i]] <- data.frame(nrounds = trees[trees != loop$nrounds[i]])
                    }
                    list(loop = loop, submodels = submodels)
                  }

$fit
function(x, y, wts, param, lev, last, classProbs, ...) {
                    if(!inherits(x, "xgb.DMatrix"))
                      x <- as.matrix(x)
                    
                    if(is.factor(y)) {
                      
                      if(length(lev) == 2) {
                        
                        y <- ifelse(y == lev[1], 1, 0)

                        if(!inherits(x, "xgb.DMatrix"))
                          x <- xgboost::xgb.DMatrix(x, label = y, missing = NA) else
                            xgboost::setinfo(x, "label", y)
                        
                        if (!is.null(wts))
                          xgboost::setinfo(x, 'weight', wts)
                        
                        out <- xgboost::xgb.train(list(eta = param$eta,
                                                       max_depth = param$max_depth,
                                                       gamma = param$gamma,
                                                       colsample_bytree = param$colsample_bytree,
                                                       min_child_weight = param$min_child_weight,
                                                       subsample = param$subsample),
                                                  data = x,
                                                  nrounds = param$nrounds,
                                                  objective = "binary:logistic",
                                                  ...)
                      } else {
                        
                        y <- as.numeric(y) - 1

                        if(!inherits(x, "xgb.DMatrix"))
                          x <- xgboost::xgb.DMatrix(x, label = y, missing = NA) else
                            xgboost::setinfo(x, "label", y)
                        
                        if (!is.null(wts))
                          xgboost::setinfo(x, 'weight', wts)
                        
                        out <- xgboost::xgb.train(list(eta = param$eta,
                                                       max_depth = param$max_depth,
                                                       gamma = param$gamma,
                                                       colsample_bytree = param$colsample_bytree,
                                                       min_child_weight = param$min_child_weight,
                                                       subsample = param$subsample),
                                                       data = x,
                                                       num_class = length(lev),
                                                       nrounds = param$nrounds,
                                                       objective = "multi:softprob",
                                                       ...)
                      }
                    } else {

                      if(!inherits(x, "xgb.DMatrix"))
                        x <- xgboost::xgb.DMatrix(x, label = y, missing = NA) else
                          xgboost::setinfo(x, "label", y)
                      
                      if (!is.null(wts))
                        xgboost::setinfo(x, 'weight', wts)
                      
                      out <- xgboost::xgb.train(list(eta = param$eta,
                                                     max_depth = param$max_depth,
                                                     gamma = param$gamma,
                                                     colsample_bytree = param$colsample_bytree,
                                                     min_child_weight = param$min_child_weight,
                                                     subsample = param$subsample),
                                                 data = x,
                                                 nrounds = param$nrounds,
                                                 objective = "reg:linear",
                                                 ...)
                    }
                    out
                    
                    
                  }

$predict
function(modelFit, newdata, submodels = NULL) {
                    if(!inherits(newdata, "xgb.DMatrix")) {
                      newdata <- as.matrix(newdata)
                      newdata <- xgboost::xgb.DMatrix(data=newdata, missing = NA)
                    }
                   out <- predict(modelFit, newdata)
                    if(modelFit$problemType == "Classification") {
                      if(length(modelFit$obsLevels) == 2) {
                        out <- ifelse(out >= .5,
                                      modelFit$obsLevels[1],
                                      modelFit$obsLevels[2])
                      } else {
                        out <- matrix(out, ncol = length(modelFit$obsLevels), byrow = TRUE)
                        out <- modelFit$obsLevels[apply(out, 1, which.max)]
                      }
                    }
                    
                    if(!is.null(submodels)) {
                      tmp <- vector(mode = "list", length = nrow(submodels) + 1)
                      tmp[[1]] <- out
                      for(j in seq(along = submodels$nrounds)) {
                        tmp_pred <- predict(modelFit, newdata, ntreelimit = submodels$nrounds[j])
                        if(modelFit$problemType == "Classification") {
                          if(length(modelFit$obsLevels) == 2) {
                            tmp_pred <- ifelse(tmp_pred >= .5,
                                               modelFit$obsLevels[1],
                                               modelFit$obsLevels[2])
                          } else {
                            tmp_pred <- matrix(tmp_pred, ncol = length(modelFit$obsLevels), byrow = TRUE)
                            tmp_pred <- modelFit$obsLevels[apply(tmp_pred, 1, which.max)]
                          }
                        }
                        tmp[[j+1]]  <- tmp_pred
                      }
                      out <- tmp
                    }
                    out
                  }

$prob
function(modelFit, newdata, submodels = NULL) {
                    if(!inherits(newdata, "xgb.DMatrix")) {
                      newdata <- as.matrix(newdata)
                      newdata <- xgboost::xgb.DMatrix(data=newdata, missing = NA)
                    }
                    
                    if( !is.null(modelFit$param$objective) && modelFit$param$objective == 'binary:logitraw'){
                      p <- predict(modelFit, newdata)
                      out <-binomial()$linkinv(p) # exp(p)/(1+exp(p))
                    } else {
                      out <- predict(modelFit, newdata)
                    }
                   if(length(modelFit$obsLevels) == 2) {
                     out <- cbind(out, 1 - out)
                      colnames(out) <- modelFit$obsLevels
                    } else {
                      out <- matrix(out, ncol = length(modelFit$obsLevels), byrow = TRUE)
                      colnames(out) <- modelFit$obsLevels
                    }
                    out <- as.data.frame(out)
                    
                    if(!is.null(submodels)) {
                      tmp <- vector(mode = "list", length = nrow(submodels) + 1)
                      tmp[[1]] <- out
                      for(j in seq(along = submodels$nrounds)) {
                        tmp_pred <- predict(modelFit, newdata, ntreelimit = submodels$nrounds[j])
                        if(length(modelFit$obsLevels) == 2) {
                          tmp_pred <- cbind(tmp_pred, 1 - tmp_pred)
                          colnames(tmp_pred) <- modelFit$obsLevels
                        } else {
                          tmp_pred <- matrix(tmp_pred, ncol = length(modelFit$obsLevels), byrow = TRUE)
                          colnames(tmp_pred) <- modelFit$obsLevels
                        }
                        tmp_pred <- as.data.frame(tmp_pred)
                        tmp[[j+1]]  <- tmp_pred
                      }
                      out <- tmp
                    }
                    out
                  }

$predictors
function(x, ...) {
                    imp <- xgboost::xgb.importance(x$xNames, model = x)
                    x$xNames[x$xNames %in% imp$Feature]
                  }

$varImp
function(object, numTrees = NULL, ...) {
                    imp <- xgboost::xgb.importance(object$xNames, model = object)
                    imp <- as.data.frame(imp)[, 1:2]
                    rownames(imp) <- as.character(imp[,1])
                    imp <- imp[,2,drop = FALSE]
                    colnames(imp) <- "Overall"
                    
                    missing <- object$xNames[!(object$xNames %in% rownames(imp))]
                    missing_imp <- data.frame(Overall=rep(0, times=length(missing)))
                    rownames(missing_imp) <- missing
                    imp <- rbind(imp, missing_imp)
                    
                    imp
                  }

$levels
function(x) x$obsLevels

$tags
[1] "Tree-Based Model"           "Boosting"                  
[3] "Ensemble Model"             "Implicit Feature Selection"
[5] "Accepts Case Weights"      

$sort
function(x) {
                    # This is a toss-up, but the # trees probably adds
                    # complexity faster than number of splits
                    x[order(x$nrounds, x$max_depth, x$eta, x$gamma, x$colsample_bytree, x$min_child_weight),]
                  }
train_control_xgb <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  verboseIter = 1,
  search = "random"
)
modelo_xgb <- train(
  receita,
  credit_data %>% filter(base %in% "treino") %>% select(-base),
  method = "xgbTree", # FILL IN HERE
  metric = "ROC",
  trControl = train_control_xgb,
  tuneGrid = tune_grid_xgb
)
Preparing recipe
+ Fold1: eta=0.010, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
- Fold1: eta=0.010, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
+ Fold1: eta=0.100, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
- Fold1: eta=0.100, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
+ Fold1: eta=0.146, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
- Fold1: eta=0.146, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
+ Fold1: eta=0.500, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
- Fold1: eta=0.500, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
+ Fold1: eta=1.000, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
- Fold1: eta=1.000, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
+ Fold2: eta=0.010, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
- Fold2: eta=0.010, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
+ Fold2: eta=0.100, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
- Fold2: eta=0.100, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
+ Fold2: eta=0.146, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
- Fold2: eta=0.146, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
+ Fold2: eta=0.500, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
- Fold2: eta=0.500, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
+ Fold2: eta=1.000, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
- Fold2: eta=1.000, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 
+ Fold3: eta=0.010, max_depth=6, gamma=5.638, colsample_bytree=0.4464, min_child_weight=19, subsample=0.9128, nrounds=1200 

---
title: "Decision Tree, Random Forest and XGBoost"
output: html_notebook
---

```{r, warning=FALSE, message=FALSE}
library(tidyverse)
library(AmesHousing)
library(recipes)
library(caret)
library(rpart)
library(rpart.plot)
library(ranger)
library(xgboost)
library(AUC)
```

# Data prep

```{r}
data("credit_data")

set.seed(42)
credit_data <- credit_data %>%
  mutate(
    base = if_else(runif(nrow(credit_data)) < 0.7, "treino", "teste")
  )

receita <- recipe(Status ~ ., data = credit_data %>% filter(base == "treino") %>% select(-base)) %>%
  step_meanimpute(all_numeric(), -all_outcomes()) %>%
  step_modeimpute(all_nominal(), -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_corr(all_predictors()) %>%
  step_nzv(all_predictors())
```
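
To sanity-check what the recipe will feed the models, you can estimate it on the training rows and inspect the processed data. A minimal sketch (`receita_prep` is an illustrative name; note that recent versions of recipes rename `step_meanimpute()`/`step_modeimpute()` to `step_impute_mean()`/`step_impute_mode()`):

```{r}
# Estimate the recipe steps on the training rows
receita_prep <- prep(receita, retain = TRUE)

# Processed training data (imputed, dummified, filtered columns)
juice(receita_prep) %>% glimpse()

# The same transformations applied to the test rows
bake(receita_prep, credit_data %>% filter(base == "teste") %>% select(-base)) %>% head()
```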


# Decision tree

```{r}
getModelInfo("rpart", FALSE)$rpart
```

```{r}
train_control_rpart <- trainControl(
  method = "cv", 
  number = 5, 
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  verboseIter = 1 
)

# HINT: run
# info <- getModelInfo("rpart", FALSE)$rpart
# info$parameters

grid_rpart <- data.frame(
  cp = seq(-0.001, 0.01, by= 0.0001)
)

modelo_rpart <- train(
  receita, 
  credit_data %>% filter(base == "treino") %>% select(-base), 
  method = "rpart", 
  metric = "ROC",
  trControl = train_control_rpart,
  tuneGrid = grid_rpart
)
```

## Result

```{r}
modelo_rpart
modelo_rpart$bestTune
varImp(modelo_rpart)
plot(modelo_rpart)
```

```{r}
# only for trees
rpart.plot(modelo_rpart$finalModel)
pdf("arvore.pdf", 20, 10)
rpart.plot(modelo_rpart$finalModel)
dev.off()
```


```{r}
# Confusion matrix
credit_data <- credit_data %>% 
  mutate(
    pred_rpart = predict(modelo_rpart, ., type = "prob")$bad
  )

credit_data_teste <- credit_data %>% filter(base %in% "teste")
caret::confusionMatrix(
  predict(modelo_rpart, credit_data_teste), 
  credit_data_teste$Status, 
  mode = "everything"
)
```
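
`predict(modelo_rpart, newdata)` labels an observation "bad" or "good" with the default 0.5 probability cutoff. To trade sensitivity against specificity, you can classify from `pred_rpart` with an explicit cutoff instead — a sketch with a purely illustrative cutoff of 0.3:

```{r}
# Classify as "bad" whenever P(bad) exceeds the chosen cutoff (0.3 is illustrative)
corte <- 0.3
pred_corte <- factor(
  if_else(credit_data_teste$pred_rpart > corte, "bad", "good"),
  levels = levels(credit_data_teste$Status)
)
caret::confusionMatrix(pred_corte, credit_data_teste$Status, mode = "everything")
```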

```{r}
# ROC curve
credit_data_teste <- credit_data %>%
  filter(base %in% "teste") %>%
  mutate(
    Status_para_roc = factor(if_else(Status == "good", 0, 1))
  ) 

roc_teste <- roc(
  credit_data_teste$pred_rpart, 
  credit_data_teste$Status_para_roc
)
AUC::auc(roc_teste)
plot(roc_teste)
```


```{r}
# extra ROC curve ---- careful: advanced R code!
roc_rpart <- credit_data %>%
  mutate(
    Status_para_roc = factor(if_else(Status == "good", 0, 1))
  ) %>%
  group_by(base) %>%
  nest() %>%
  mutate(
    roc = map(data, ~ AUC::roc(.x$pred_rpart, .x$Status_para_roc)),
    auc = map_dbl(roc, AUC::auc)
  )
roc_rpart
```
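
If the nest()/map() idiom above feels opaque, the same per-base AUCs can be computed with a plain group_by()/summarise() — a minimal equivalent sketch:

```{r}
# One AUC per base (treino/teste), no nesting required
credit_data %>%
  mutate(Status_para_roc = factor(if_else(Status == "good", 0, 1))) %>%
  group_by(base) %>%
  summarise(auc = AUC::auc(AUC::roc(pred_rpart, Status_para_roc)))
```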


```{r}
roc_rpart$roc %>% walk(plot)
```

```{r}
# extra plot ---- careful: advanced R code!
roc_plot <- roc_rpart %>%
  select(base, roc, auc) %>%
  mutate(
    roc = map(roc, ~{
      .x %>% 
        unclass %>% 
        as.data.frame
    })
  ) %>%
  unnest(roc) %>%
  ggplot(aes(x = fpr, y = tpr, colour = base, label = cutoffs)) +
  geom_line() +
  geom_abline(colour = "grey50") +
  theme_minimal() +
  coord_fixed()

plotly::ggplotly(roc_plot)
```


# Random Forest 
```{r}
infos <- getModelInfo("ranger", FALSE)$ranger

```

```{r}
train_control_rf <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  verboseIter = 1
)

# HINT: run
# info <- getModelInfo("ranger", FALSE)$ranger
# info$parameters
# 
# grid_rf <- expand.grid(
#   mtry = c(2, 4, 6), # FILL IN HERE
#   min.node.size = seq(10, 100, by = 20),
#   splitrule = "gini"
# )

modelo_rf <- train(
  receita,
  credit_data %>% filter(base %in% "treino") %>% select(-base),
  method = "ranger", # FILL IN HERE
  importance = "permutation",
  metric = "ROC",
  trControl = train_control_rf,
  tuneLength = 20
)

save(modelo_rf, file = "modelo_rf.RData")

load("modelo_rf.RData")
```
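
Instead of letting `tuneLength = 20` choose the candidates, you can spell out the grid yourself; caret's "ranger" method tunes `mtry`, `splitrule` and `min.node.size`, exactly the three parameters in the commented hint above. A sketch (`modelo_rf_grid` is an illustrative name):

```{r}
# Explicit grid over ranger's three tuning parameters
grid_rf <- expand.grid(
  mtry = c(2, 4, 6),
  splitrule = "gini",
  min.node.size = seq(10, 100, by = 20)
)

modelo_rf_grid <- train(
  receita,
  credit_data %>% filter(base %in% "treino") %>% select(-base),
  method = "ranger",
  importance = "permutation",
  metric = "ROC",
  trControl = train_control_rf,
  tuneGrid = grid_rf
)
```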

## Result
```{r}
modelo_rf
modelo_rf$bestTune
varImp(modelo_rf)
plot(modelo_rf)
```

```{r}
# Predictions

credit_data <- credit_data %>% 
  mutate(
    pred_rf = predict(modelo_rf, ., type = "prob")$bad
  )
```


```{r}
# Confusion matrix
credit_data_teste <- credit_data %>% filter(base %in% "teste")
caret::confusionMatrix(predict(modelo_rf, credit_data_teste), credit_data_teste$Status, mode = "everything")
```

```{r}
# ROC curves ---- careful: advanced R code!
rocs <- credit_data %>%
  mutate(
    Status_para_roc = factor(if_else(Status == "good", 0, 1))
  ) %>%
  select(base, Status_para_roc, starts_with("pred")) %>%
  gather(modelo, valor_predito, starts_with("pred")) %>%
  group_by(base, modelo) %>%
  nest() %>%
  mutate(
    roc = map(data, ~ AUC::roc(.x$valor_predito, .x$Status_para_roc)),
    auc = map_dbl(roc, AUC::auc)
  )

rocs
```

```{r}
# Model comparison
rocs %>%
  ggplot(aes(x = auc, y = modelo, colour = base)) +
  geom_point(size = 5) +
  theme_minimal(30)
```


```{r}
# extra plot ---- careful: advanced R code!
roc_plot <- rocs %>%
  select(base, modelo, roc) %>%
  mutate(
    roc = map(roc, ~{
      .x %>% 
        unclass %>% 
        as.data.frame
    })
  ) %>%
  unnest(roc) %>%
  ggplot(aes(x = fpr, y = tpr, colour = modelo, label = cutoffs)) +
  geom_line() +
  geom_abline(colour = "grey50") +
  theme_minimal() +
  coord_fixed() +
  facet_wrap(~base)

plotly::ggplotly(roc_plot)
```





# XGBoost

Exercise: fit an xgboost using caret and answer: which model has the highest AUC? Ctrl+C, Ctrl+V on your own!

HINT 1) swap "ranger" for "xgbTree".
HINT 2) run `info <- getModelInfo("xgbTree", FALSE)$xgbTree` and then inspect `info$parameters`.
HINT 3) try the `tuneLength = 20` argument instead of `tuneGrid` (see the sketch below).
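
Following HINT 3, a sketch that lets caret draw 20 random hyperparameter combinations instead of a fixed grid (random search requires `search = "random"` in the control object; `modelo_xgb_random` is an illustrative name):

```{r}
# Random search over xgbTree's hyperparameters: 20 draws
train_control_xgb_random <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  verboseIter = TRUE,
  search = "random"
)

modelo_xgb_random <- train(
  receita,
  credit_data %>% filter(base %in% "treino") %>% select(-base),
  method = "xgbTree",
  metric = "ROC",
  trControl = train_control_xgb_random,
  tuneLength = 20
)
```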

```{r}
getModelInfo("xgbTree", FALSE)$xgbTree
```


```{r}
train_control_xgb <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = twoClassSummary,
  verboseIter = 1,
  search = "grid"
)
```




```{r}
tune_grid_xgb <- expand.grid(
  nrounds          = 500,
  max_depth        = 6,
  eta              = 0.01,
  gamma            = 5.6384934,
  colsample_bytree = 0.4464347,
  min_child_weight = 19.0000000,
  subsample        = 0.9128172
)

modelo_xgb <- train(
  receita,
  credit_data %>% filter(base %in% "treino") %>% select(-base),
  method = "xgbTree", # FILL IN HERE
  metric = "ROC",
  trControl = train_control_xgb,
  tuneGrid = tune_grid_xgb
)

modelo_xgb$bestTune %>% t
```
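
As with the tree and the random forest, variable importance is available through caret; the fitted booster can also be queried directly with xgboost's own importance function, mirroring the `$varImp` source shown by `getModelInfo` above. A minimal sketch:

```{r}
# caret-level importance for the xgboost model
varImp(modelo_xgb)

# or straight from the underlying booster
xgboost::xgb.importance(modelo_xgb$finalModel$xNames, model = modelo_xgb$finalModel)
```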

```{r}
# Predictions

credit_data <- credit_data %>% 
  mutate(
    pred_xgb = predict(modelo_xgb, ., type = "prob")$bad
  )

credit_data %>% select(starts_with("pred"), everything())
```


```{r}
# ROC curves ---- careful: advanced R code!
rocs <- credit_data %>%
  mutate(
    Status_para_roc = factor(if_else(Status == "good", 0, 1))
  ) %>%
  select(base, Status_para_roc, starts_with("pred")) %>%
  gather(modelo, valor_predito, starts_with("pred")) %>%
  group_by(base, modelo) %>%
  nest() %>%
  mutate(
    roc = map(data, ~ AUC::roc(.x$valor_predito, .x$Status_para_roc)),
    auc = map_dbl(roc, AUC::auc)
  )

rocs
```

```{r}
# Model comparison
rocs %>%
  ggplot(aes(x = auc, y = modelo, colour = base)) +
  geom_point(size = 5) +
  theme_minimal(30)
```


```{r}
# extra plot ---- careful: advanced R code!
roc_plot <- rocs %>%
  select(base, modelo, roc) %>%
  mutate(
    roc = map(roc, ~{
      .x %>% 
        unclass %>% 
        as.data.frame
    })
  ) %>%
  unnest(roc) %>%
  ggplot(aes(x = fpr, y = tpr, colour = modelo, label = cutoffs)) +
  geom_line() +
  geom_abline(colour = "grey50") +
  theme_minimal() +
  coord_fixed() +
  facet_wrap(~base)

plotly::ggplotly(roc_plot)
```